| SITUATION | Science versus Corona has means to create pedestrian models to simulate individual movement and crowd behavior by:
|
| COMPLICATION |
|
| REQUEST |
|
Installation of packages and processing data
# Load required packages; suppressMessages() keeps package start-up
# messages out of the knitted report.
suppressMessages(library(lubridate))
suppressMessages(library(tidyverse))
suppressMessages(library(data.table))
suppressMessages(library(scales))
suppressMessages(library(ggpubr))
suppressMessages(library(ggrepel))
suppressMessages(library(magrittr))
# keep time digits: print up to 6 fractional-second digits on timestamps;
# the previous option set is kept in `op` so it could be restored later
op <- options(digits.secs = 6)
# raw tracking data: one row per recorded (x, y) position per tag
original_data <- read_csv("datapoint_all_experiments.csv")
## Rows: 2048137 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): id, tag_id, experiment_id, person_id, x, y
## dttm (1): timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# pre-processed tracking data (adds speed_*, slow, at_tablet columns)
processed_data <- read_csv("datapoint_all_experiments_processed.csv")
## Rows: 1094023 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (19): person_id, id, tag_id, experiment_id, x, y, speed_x, speed_y, spe...
## lgl (2): slow, at_tablet
## dttm (1): timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# tablet positions file is semicolon-delimited, hence base read.csv with sep=";"
tablet <- read.csv("tablet_positions.csv", sep=";") %>%
# add centre tablet to the data
add_row(tablet_id = 10, starting_point = "f", x = 3.33, y = 3.97)
# task log: which person walked from which tablet to which, and when
task <- read_csv("person_tasks.csv")
## Rows: 11419 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): person_id, experiment_id, tablet_to_id, tablet_from_id
## dttm (2): start_time, end_time
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Keep in mind that a missing data point is defined as a 0.5 second (500 ms) time frame in which a tag has no recorded x and y coordinates. The data quality will be diagnosed in three steps:
Create data frames to fit the purpose of analysis
# remove 2022-10-18 data
original_data2 <- original_data %>%
# NOTE(review): read_csv already parsed timestamp as dttm, so ymd_hms() here
# is a redundant re-parse (harmless, but it standardises the timezone to UTC)
mutate(timestamp = ymd_hms(timestamp),
# create a date variable (timestamp floored to midnight of the same day)
date = floor_date(timestamp, unit="day")) %>%
# remove October 18 as stipulated by client
# NOTE(review): `date` is POSIXct while ymd() returns Date; the != comparison
# relies on implicit coercion -- confirm timezone assumptions hold
filter(date != ymd("2022-10-18"))
# three data frames for different purposes
## main df: grouped by tag_id, person_id, experiment_id so that `diff` is the
## time gap to the previous data point of the SAME tag/person/experiment
interact_df <- original_data2 %>%
  group_by(tag_id, person_id, experiment_id) %>%
  # timestamp was already converted to POSIXct when original_data2 was built,
  # so re-parsing it with ymd_hms() (twice per row, as the original did) is
  # unnecessary; `diff` is a difftime in seconds, NA on each group's first row
  mutate(diff = timestamp - lag(timestamp))
### flag gaps larger than 0.5 s: by definition a 0.5 second time frame in which
### a tag has no recorded x and y coordinates is a missing data point.
### Note "NA" here is a character label, not a real missing value.
NA_detect_df <- interact_df %>%
  mutate(condition = ifelse(diff > 0.5, "NA", "Normal"))
### for plots with summary: per-group mean/median gap between data points
summary_df <- interact_df %>%
drop_na() %>%
summarize(mean = mean(diff),
# for clearer display of numbers / relative difference
# NOTE(review): labelled "ms" downstream, but seconds * 100 is centiseconds,
# not milliseconds (that would be * 1000); the plot thresholds (25.13 / 25.2)
# are calibrated to this * 100 scale, so confirm before changing the factor
mean_ms = mean(diff) * 100,
# median as well, because the gap distribution is skewed
median = median(diff),
median_ms= median(diff) * 100)
## `summarise()` has grouped output by 'tag_id', 'person_id'. You can override
## using the `.groups` argument.
Check at which Hz the data contains location data for all participants.
# Shared jitter position for the point and label layers so labels stay aligned
# with their points; the fixed seed makes the jitter reproducible.
# (height/width spelled out -- the original used partial matching via h=/w=.)
pos <- position_jitter(height = 0.05, width = 0.3, seed = 2)
mean_ms_interaction <- summary_df %>%
  ggplot() +
  # ordinary interactions: grey jittered points
  geom_jitter(
    data = filter(summary_df, median_ms < 25.13),
    mapping = aes(x = 1, y = median_ms),
    alpha = 0.5,
    colour = "#878787",
    size = 2,
    position = pos
  ) +
  # highlight interactions with a high median gap per data point.
  # fill is set as a constant OUTSIDE aes(): the original had aes(fill =
  # "darkred"), which maps the string as a data value and renders the default
  # palette colour instead of dark red
  geom_jitter(
    data = filter(summary_df, median_ms > 25.2),
    mapping = aes(x = 1, y = median_ms),
    fill = "darkred",
    shape = 21,
    size = 2,
    position = pos
  ) +
  # label tag x person id on those extraordinary points
  # NOTE(review): labelling threshold (25.13) differs from the highlight
  # threshold (25.2) above -- confirm this asymmetry is intended
  geom_text_repel(
    data = filter(summary_df, median_ms > 25.13),
    mapping = aes(x = 1, y = median_ms,
                  label = paste("Tag", tag_id, "x Person", person_id)),
    position = pos,
    size = 5
  ) +
  labs(title = "Tag ID x Person ID Interaction",
       subtitle = "Median Seconds Per Data Point",
       y = "Median Seconds") +
  # NOTE(review): median_ms is seconds * 100 (centiseconds); the " ms" suffix
  # is misleading -- confirm the intended unit
  scale_y_continuous(labels = function(x){paste0(x, " ms")}) +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        text = element_text(size = 20),
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
# save in pdf
# ggsave("mean_ms_interaction.pdf", mean_ms_interaction, width = 15)
mean_ms_interaction
Missing data is identified first in the data frame created above, and the below plots will display results of:
Overall NAs in the data,
NAs per Tag ID,
NAs per Experiment ID
Showing what’s the overall trend of NAs in the data.
# Overall share of missing vs normal gaps across the whole data set.
Overall_NA_p <- NA_detect_df %>%
  drop_na() %>%
  group_by(condition) %>%
  summarize(n = n()) %>%
  mutate(per = prop.table(n) * 100) %>%
  # build the label vectorised per row instead of hard-coding a 2-element
  # c(per[1], per[2]) vector, so the plot stays correct regardless of how many
  # condition levels exist
  ggplot(aes(x = condition, y = per,
             label = paste(round(per, 4), "%"))) +
  # fill colours matched by row order ("NA" sorts before "Normal")
  geom_col(color = "black",
           fill = c("darkred", "darkblue")) +
  geom_text(vjust = -0.5,
            size = 3) +
  labs(title = "Overall NA Percentage",
       x = "Group", y = "Percentage") +
  scale_y_continuous(labels = function(x){paste0(x, " %")}) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        text = element_text(size = 20))
# In general there doesn't seem to be that much missing data
# NA percentage per Tag ID
spec_NA_df <- NA_detect_df %>%
group_by(tag_id, condition) %>%
summarize(n = n()) %>%
# per = share of each condition within a tag; the denominator is per tag_id
# because the frame is still grouped by tag_id after summarize()
mutate(per = n/sum(n)) %>%
filter(condition == "NA") %>%
# bucket tags into three facets by NA share; the unnamed third argument of
# cut() is `labels`. Cut-offs were chosen by eye for this data set.
mutate(group = cut(per, breaks = c(-Inf, 0.000255, 0.000390, Inf),
c("Low", "Middle", "High")))
## `summarise()` has grouped output by 'tag_id'. You can override using the
## `.groups` argument.
# Bar chart of NA share per tag, faceted into the Low/Middle/High groups.
Specific_tag_NA_p <- spec_NA_df %>%
ggplot(aes(x = reorder(as.factor(tag_id), per), y = per,
label = percent(per %>% round(4)))) +
geom_col(alpha = 0.7) +
# re-draw the "High" group bars in dark red on top of the grey bars
geom_col(data = subset(spec_NA_df, group == "High"), fill = "darkred") +
# extra black outline on the worst tag(s), above 0.065% NA share
geom_col(data = subset(spec_NA_df, per > 0.00065),
fill = "darkred", color = "black", size = 1) +
geom_text(vjust = -0.5, # nudge above top of bar
size = 3) +
# dashed blue reference line at the mean NA share across tags
geom_hline(aes(yintercept = mean(per)), color = "blue",
alpha = 0.5, linetype = "dashed") +
labs(title = "NA Percentage per Tag",
x = "Tag ID", y = "Percentage") +
scale_y_continuous(labels = percent_format()) +
theme_minimal() +
# free scales so each group's bars fill its facet; strip labels hidden below
facet_grid(facets = . ~ group, scales = "free") +
theme(strip.text.x = element_blank(),
plot.title = element_text(hjust = 0.5),
text = element_text(size = 20))
# NA share per experiment; per is computed within experiment_id because the
# frame is still grouped by experiment_id after summarize()
spec_NA_df_exp <- NA_detect_df %>%
group_by(experiment_id, condition) %>%
summarize(n = n()) %>%
mutate(per = n/sum(n)) %>%
filter(condition == "NA")
## `summarise()` has grouped output by 'experiment_id'. You can override using the
## `.groups` argument.
# Bar chart of NA share per experiment; bars above the mean NA share are
# highlighted in dark red, and experiment 16 additionally gets a black outline.
Specific_exp_NA_p <- spec_NA_df_exp %>%
ggplot(aes(x = reorder(as.factor(experiment_id), per), y = per,
label = percent(per %>% round(4)))) +
geom_col(alpha = 0.7) +
geom_col(data = subset(spec_NA_df_exp, per > mean(per)),
fill = "darkred") +
# black outline singles out the most concerning experiment
geom_col(data = subset(spec_NA_df_exp, experiment_id == 16),
fill = "darkred", color = "black", size = 1) +
geom_text(vjust = -0.5, # nudge above top of bar
size = 3) +
# dashed blue reference line at the mean NA share across experiments
geom_hline(aes(yintercept = mean(per)), color = "blue",
alpha = 0.5, linetype = "dashed") +
labs(title = "NA Percentage per Experiment",
x = "Experiment ID", y = "Percentage") +
scale_y_continuous(labels = percent_format()) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5),
text = element_text(size = 20))
#ggsave("Overall_NA_p.pdf", Overall_NA_p, width = 15)
#ggsave("Specific_tag_NA_p.pdf", Specific_tag_NA_p, width = 15)
#ggsave("Specific_exp_NA_p.pdf", Specific_exp_NA_p, width = 15)
Overall_NA_p
Specific_tag_NA_p
Specific_exp_NA_p
Short summary: Only 0.04% of the data are NAs, where no specific trend can be found within tag ID or experiment ID. Only some seem to be more concerning than others (Tag ID: 9, 55, 57).
The next plots introduce the interaction between:
on NA Percentage in the data.
# Investigate on tag x person
## dataframe for specific conditions
spec_interact_detect_df <- NA_detect_df %>%
group_by(tag_id, person_id, condition) %>%
summarize(n = n()) %>%
# per = NA share within each tag x person pair (the frame is still grouped
# by tag_id and person_id after summarize)
mutate(per = n/sum(n)) %>%
filter(condition == "NA") %>%
ungroup()
## `summarise()` has grouped output by 'tag_id', 'person_id'. You can override
## using the `.groups` argument.
# NA share of every tag x person combination, sorted ascending; bars at or
# above the mean share are coloured dark red via scale_color_identity().
tag_person_p <- spec_interact_detect_df %>%
  arrange(per) %>%
  mutate(Color = ifelse(per < mean(per), "grey", "darkred")) %>%
  # seq_along() instead of the original 1:length(per) -- identical here, but
  # safe if the frame were ever empty
  ggplot(aes(x = seq_along(per), y = per, color = Color)) +
  geom_col() +
  # dashed blue reference line at the mean NA share
  geom_hline(aes(yintercept = mean(per)), color = "blue",
             linetype = "dashed") +
  # take the Color column values literally instead of mapping through a scale
  scale_color_identity() +
  labs(title = "NA Percentage per Tag x Person",
       y = "Percentage") +
  scale_y_continuous(labels = percent_format()) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        text = element_text(size = 20),
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
## Plot: top 20 tag x person combinations by NA share
top_tag_person_p <- spec_interact_detect_df %>%
  ungroup() %>%
  top_n(n = 20, per) %>%
  ggplot(aes(x = reorder(as.factor(person_id), per), y = per,
             label = paste("Tag", tag_id))) +
  geom_col() +
  # overlay combinations above 1% NA share in dark red.
  # NOTE(review): this subset is taken from the UNfiltered frame, not the
  # top-20 subset -- confirm that every row above 1% is also in the top 20
  geom_col(data = subset(spec_interact_detect_df, per > 0.01),
           fill = "darkred") +
  geom_text(vjust = -0.5, # nudge above top of bar
            size = 3) +
  # single percent y-scale: the original added scale_y_continuous() twice,
  # which triggered a "scale already present" warning and silently replaced
  # the first scale with the second identical one
  scale_y_continuous(labels = percent_format()) +
  labs(title = "Top 20 NA Percentage per Tag x Person",
       x = "Person ID", y = "Percentage") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        text = element_text(size = 20))
#ggsave("tag_person_p.pdf", tag_person_p, width = 15)
#ggsave("top_tag_person_p.pdf", top_tag_person_p, width = 15)
tag_person_p
top_tag_person_p
# How many above-average tag x person combinations each tag contributes;
# a tag that recurs often here is a candidate hardware problem.
spec_interact_detect_df %>%
filter(per > mean(per)) %>%
count(tag_id)
## # A tibble: 28 × 2
## tag_id n
## <dbl> <int>
## 1 3 1
## 2 5 4
## 3 6 1
## 4 7 3
## 5 8 1
## 6 9 8
## 7 10 1
## 8 13 1
## 9 14 2
## 10 21 2
## # … with 18 more rows
When investigating the top few person IDs from the results above — those with a higher NA percentage than the average — we notice that Tag 9 recurs a lot. The last table shows Tag 9 occurred 8 times, while other tags occur around 1–3 times on average. Keep in mind that person IDs 378, 263, and 655 have the most NAs here.
Additional exploration with experiment ID x person ID
# Top-20 experiment x person combinations by NA share
exp_NA_detect_df <- NA_detect_df %>%
group_by(experiment_id, person_id, condition) %>%
summarize(n = n()) %>%
# within-combination share (the frame is still grouped after summarize)
mutate(per = n/sum(n)) %>%
filter(condition == "NA") %>%
ungroup() %>%
top_n(20, per)
## `summarise()` has grouped output by 'experiment_id', 'person_id'. You can
## override using the `.groups` argument.
# Top-20 experiment x person combinations by NA share; bars above the mean
# are highlighted in dark red and each bar is labelled with its experiment.
exp_person_p <- exp_NA_detect_df %>%
  ggplot(aes(x = reorder(as.factor(person_id), per), y = per,
             label = paste("Exp", experiment_id))) +
  geom_col() +
  geom_col(data = subset(exp_NA_detect_df, per > mean(per)),
           fill = "darkred") +
  geom_text(vjust = -0.5, # nudge above top of bar
            size = 3) +
  # NOTE(review): no fill aesthetic is mapped, so this guide has no visible
  # effect -- kept for parity with the original, confirm it can be dropped
  guides(fill = guide_legend(title = "Experiment ID")) +
  # x-axis shows person_id, so label it "Person ID" (the original said
  # "Experiment ID", contradicting the mapping and the sibling plot)
  labs(title = "NA Percentage per Experiment x Person",
       x = "Person ID", y = "Percentage") +
  scale_y_continuous(labels = percent_format()) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5),
        text = element_text(size = 20))
# ggsave("exp_person_p.pdf", exp_person_p, width = 15)
exp_person_p
When exploring the data by experiment ID x person ID, the same person IDs (378, 263, and 655) turn out to be the ones with the highest NA percentages.
Next, we explore the Tag ID & Person ID counts to see how many NAs we can expect per certain amount of data points.
# Stacked counts of NA vs Normal data points per tag; only the NA count is
# printed above each bar. Fill colours are matched to the conditions by
# factor order ("NA" sorts before "Normal").
counts_NA_p <- NA_detect_df %>%
drop_na() %>%
group_by(tag_id, condition) %>%
summarize(n = n()) %>%
ungroup() %>%
ggplot(aes(x = reorder(as.factor(tag_id), n), y = n, fill = condition,
label = paste(ifelse(condition == "NA", n, "")))) +
geom_col(position = "stack") +
geom_text(vjust = -0.5) +
scale_fill_manual(values = c("darkred", "lightgrey")) +
guides(fill = guide_legend(title = "Condition")) +
labs(title = "Number of Data points in Each Tag ID",
x = "Tag ID", y = "Counts") +
theme_minimal() +
theme(legend.position = "top",
plot.title = element_text(hjust = 0.5),
text = element_text(size = 20))
## `summarise()` has grouped output by 'tag_id'. You can override using the
## `.groups` argument.
# Top 50 persons by absolute number of missing data points (ascending bars).
person_counts_NA_p <- NA_detect_df %>%
  drop_na() %>%
  group_by(person_id, condition) %>%
  summarize(n = n()) %>%
  mutate(per = prop.table(n) * 100) %>%
  filter(condition == "NA") %>%
  ungroup() %>%
  arrange(n) %>%
  top_n(50, n) %>%
  # seq_along() instead of the original 1:length(n) -- identical here, but
  # safe if the frame were ever empty. The fill aesthetic mapped below is
  # overridden by the constant fill in geom_col, and the legend is hidden.
  ggplot(aes(x = seq_along(n), y = n, fill = condition,
             label = n)) +
  geom_col(fill = "darkred") +
  geom_text(vjust = -0.5) +
  labs(title = "Number of NA Data points in Person",
       y = "Counts") +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        text = element_text(size = 20),
        axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
## `summarise()` has grouped output by 'person_id'. You can override using the
## `.groups` argument.
#ggsave("counts_NA_p.pdf", counts_NA_p, width = 15)
#ggsave("person_counts_NA_p.pdf", person_counts_NA_p, width = 15)
# summary of counts: per-person NA and Normal counts, summarised separately
NA_detect_df %>%
drop_na() %>%
group_by(person_id, condition) %>%
summarize(n = n()) %>%
# split into one data frame per condition, then run summary() on each
split(.$condition) %>%
map(summary)
## `summarise()` has grouped output by 'person_id'. You can override using the
## `.groups` argument.
## $`NA`
## person_id condition n
## Min. : 39.0 Length:307 Min. : 1.000
## 1st Qu.:260.5 Class :character 1st Qu.: 1.000
## Median :399.0 Mode :character Median : 1.000
## Mean :390.6 Mean : 2.394
## 3rd Qu.:536.5 3rd Qu.: 2.000
## Max. :690.0 Max. :37.000
##
## $Normal
## person_id condition n
## Min. : 30.0 Length:611 Min. : 76
## 1st Qu.:229.5 Class :character 1st Qu.:3016
## Median :384.0 Mode :character Median :3408
## Mean :373.8 Mean :3237
## 3rd Qu.:537.5 3rd Qu.:3682
## Max. :690.0 Max. :5384
counts_NA_p
person_counts_NA_p
The summary table shows we can expect around 2.4 missing data points per 3237 data points. The NAs seem to be increasing as the size of the total amount of data increases. You can expect around 0.04 to 0.07 % NAs in the data.
Below, we create a plot of the deviance in location when people are assumed to be stationary for each tablet location.
As can be seen from the plots of the location deviance, the pattern of noise differs per tablet location. Although warping of space towards the centre was expected, visual inspection suggests that warping mainly occurs at the top-left (tablet IDs 2, 3) and bottom-right corners (tablet IDs 6, 7, 8) of the field.
# Change the tablet positions to be comparable to the deviance data.
# Read the report data ONCE via a relative path and reuse it below; the
# original code read an absolute /Users/... path here (non-portable) and then
# re-read the very same file a second time when building tab_deviance.
plot_data <- read_csv("data-for-report.csv")
## Rows: 1052925 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): person_id, id, tag_id, experiment_id, x, y, speed_x, speed_y, spe...
## lgl (2): slow, at_tablet
## dttm (1): timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-tablet mean observed position, joined with the nominal tablet positions;
# deviance = nominal position minus mean observed position.
tab_deviance <- plot_data %>%
  group_by(at_tablet_id) %>%
  summarise(mu_y=mean(y), mu_x=mean(x)) %>%
  left_join(tablet, by=c("at_tablet_id" = "tablet_id")) %>%
  mutate(deviance_x = x - mu_x,
         deviance_y = y - mu_y)
# One hexbin plot of the location deviance per tablet, with that tablet's
# mean-deviance point overlaid as a large pink star (shape 8).
for (i in seq_len(8)) {
  # mean-deviance reference point for tablet i
  t <- tab_deviance %>%
    filter(at_tablet_id == i)
  g <- plot_data %>%
    filter(at_tablet_id == i) %>%
    ggplot(aes(deviance_x, deviance_y)) +
    geom_hex() +
    geom_point(data=t, shape=8, color="pink", size=10) +
    scale_fill_continuous(type = "viridis") +
    ggtitle(paste("Tablet", i)) +
    # (removed a no-op theme(axis.title.x = element_text("")) call: the first
    # positional argument of element_text() is `family`, not the label, and
    # xlab() below sets the axis title anyway)
    xlab("x-deviance") + ylab("y-deviance") +
    guides(fill=guide_colorbar(title="Count")) +
    theme(plot.title = element_text(hjust = 0.5, size = 25),
          axis.title = element_text(size = 15))
  # explicit render: auto-printing is suppressed inside for loops
  plot(g)
  # save each plot
  ggsave(paste0("Tablet", i, ".png"), g, width = 10, height = 9)
}
Below, the correlation between the deviance in x and y is computed. It is visible that all correlation coefficients are significant. This is expected given that the sample size is very large.
Interestingly, all but one coefficient is around 0. Data at tablet 6 has a correlation of 0.5. This could also be due to the fact that tablet 6 is on the trajectory to the central point from tablet 9. This is further supported by the fact that there are many data points behind the tablet.
# Pearson correlation between x- and y-deviance, computed separately per tablet.
# NOTE(review): assumes plot_data already carries deviance_x / deviance_y among
# its 17 columns (they are truncated in the printed spec) -- confirm.
cor_res <- plot_data %>%
group_by(at_tablet_id) %>%
group_split() %>%
# each element: list(tablet id, cor.test result) for that tablet's rows
lapply(function(x) list(x$at_tablet_id[1], cor.test(x$deviance_x, x$deviance_y)))
cor_res
## [[1]]
## [[1]][[1]]
## [1] 1
##
## [[1]][[2]]
##
## Pearson's product-moment correlation
##
## data: x$deviance_x and x$deviance_y
## t = 8.177, df = 114234, p-value = 2.939e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.01838997 0.02998104
## sample estimates:
## cor
## 0.02418632
##
##
##
## [[2]]
## [[2]][[1]]
## [1] 2
##
## [[2]][[2]]
##
## Pearson's product-moment correlation
##
## data: x$deviance_x and x$deviance_y
## t = -50.288, df = 130977, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1429391 -0.1323131
## sample estimates:
## cor
## -0.1376301
##
##
##
## [[3]]
## [[3]][[1]]
## [1] 3
##
## [[3]][[2]]
##
## Pearson's product-moment correlation
##
## data: x$deviance_x and x$deviance_y
## t = -22.863, df = 141494, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06585837 -0.05547579
## sample estimates:
## cor
## -0.06066872
##
##
##
## [[4]]
## [[4]][[1]]
## [1] 4
##
## [[4]][[2]]
##
## Pearson's product-moment correlation
##
## data: x$deviance_x and x$deviance_y
## t = 12.198, df = 134159, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02793736 0.03862752
## sample estimates:
## cor
## 0.03328339
##
##
##
## [[5]]
## [[5]][[1]]
## [1] 5
##
## [[5]][[2]]
##
## Pearson's product-moment correlation
##
## data: x$deviance_x and x$deviance_y
## t = 73.372, df = 138578, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1883049 0.1984411
## sample estimates:
## cor
## 0.1933781
##
##
##
## [[6]]
## [[6]][[1]]
## [1] 6
##
## [[6]][[2]]
##
## Pearson's product-moment correlation
##
## data: x$deviance_x and x$deviance_y
## t = 204.67, df = 127036, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4938280 0.5020988
## sample estimates:
## cor
## 0.4979747
##
##
##
## [[7]]
## [[7]][[1]]
## [1] 7
##
## [[7]][[2]]
##
## Pearson's product-moment correlation
##
## data: x$deviance_x and x$deviance_y
## t = -69.884, df = 134017, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1926698 -0.1823385
## sample estimates:
## cor
## -0.1875093
##
##
##
## [[8]]
## [[8]][[1]]
## [1] 8
##
## [[8]][[2]]
##
## Pearson's product-moment correlation
##
## data: x$deviance_x and x$deviance_y
## t = 20.263, df = 132414, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.05022648 0.06096549
## sample estimates:
## cor
## 0.05559759
The pattern of missing data was largely consistent across persons. Therefore, random measurement error of 0.04% can be expected for future similar setups.
The analysis of noise was based on heuristics. It would greatly benefit from a few ground-truth data points — that is, precise locations where an individual is known to be stationary at a given time — to increase the robustness of any conclusions.
Since problems seem to surge at the extremities of the field, increasing the distance between the anchors to create a larger field and collecting the majority of the data in the centre could increase the quality of the data.
Additionally, the clients can set up stickers or marks on the floor for each tablet location, so that children can be aware of the exact spot to stand or wait at. This will increase precision in the location data.